import os
import bz2
import gzip
from pysam import VariantFile


# Build `names`: for every row of the UCSC chromAlias table whose third column
# is "refseq", map the second column (looked up later as "chr<N>") to the first
# column (passed later as the contig name when querying the dbSNP BCF file).
directory = "/osc-fs_home/scratch/mdehoon/Data/UCSC/hg38"
filename = "chromAlias.txt.gz"
path = os.path.join(directory, filename)
print("Reading", path)
names = {}
# The context manager closes the gzip stream even if a row fails to parse.
with gzip.open(path, "rt") as stream:
    for line in stream:
        words = line.split()
        if words[2] == "refseq":
            name = words[0]
            chrID = words[1]
            names[chrID] = name

# Input: the dbSNP BCF file that is queried by region further down.
directory = "/osc-fs_home/scratch/mdehoon/Data/NCBI/dbSNP/"
filename = "GCF_000001405.38.bcf"
path = os.path.join(directory, filename)
print("Reading", path)
bcf = VariantFile(path, "r")

# Output: a VCF file that reuses the header of the dbSNP file, so records
# fetched from `bcf` can be written to it unchanged.
filename = "gwas.vcf"
print("Writing", filename)
output = VariantFile(filename, mode="w", header=bcf.header)

# Open the GWAS catalog associations table and verify its header row.
# `lines` is left open, positioned on the first data row, for the code below.
directory = "/osc-fs_home/scratch/mdehoon/Data/EBI"
filename = "gwas_catalog_v1.0-associations_e105_r2022-04-07.tsv.bz2"

path = os.path.join(directory, filename)
print("Reading", path)
lines = bz2.open(path, "rt")
line = next(lines)
words = line.strip().split("\t")
# Column names the rest of the script relies on, in file order. The data rows
# are indexed by position (11 = CHR_ID, 12 = CHR_POS, 21 = SNPS), so a changed
# layout must be detected here rather than silently misread.
expected_columns = [
    "DATE ADDED TO CATALOG",
    "PUBMEDID",
    "FIRST AUTHOR",
    "DATE",
    "JOURNAL",
    "LINK",
    "STUDY",
    "DISEASE/TRAIT",
    "INITIAL SAMPLE SIZE",
    "REPLICATION SAMPLE SIZE",
    "REGION",
    "CHR_ID",
    "CHR_POS",
    "REPORTED GENE(S)",
    "MAPPED_GENE",
    "UPSTREAM_GENE_ID",
    "DOWNSTREAM_GENE_ID",
    "SNP_GENE_IDS",
    "UPSTREAM_GENE_DISTANCE",
    "DOWNSTREAM_GENE_DISTANCE",
    "STRONGEST SNP-RISK ALLELE",
    "SNPS",
    "MERGED",
    "SNP_ID_CURRENT",
    "CONTEXT",
    "INTERGENIC",
    "RISK ALLELE FREQUENCY",
    "P-VALUE",
    "PVALUE_MLOG",
    "P-VALUE (TEXT)",
    "OR or BETA",
    "95% CI (TEXT)",
    "PLATFORM [SNPS PASSING QC]",
    "CNV",
]
# Explicit check instead of `assert`: assertions are removed when Python runs
# with -O, and a bare assert does not say which column was wrong.
for index, expected in enumerate(expected_columns):
    found = words[index] if index < len(words) else None
    if found != expected:
        lines.close()
        raise ValueError(
            "Unexpected header in %s: column %d is %r, expected %r"
            % (path, index, found, expected)
        )
# For every mapped association in the GWAS catalog, look up the dbSNP record
# at each reported position and copy it to the output VCF file.
written = 0
# Number of (chromosome, position) lookups for which no dbSNP record with a
# matching id was found. Rows without a chromosome are not counted.
skipped = 0
try:
    for line in lines:
        words = line.strip().split("\t")
        if len(words) != 34:
            raise ValueError(
                "Expected 34 tab-separated columns, found %d" % len(words)
            )
        chromosome_numbers = words[11]
        positions = words[12]
        if not chromosome_numbers:
            # No chromosome reported for this row; there must be no position
            # either.
            if positions:
                raise ValueError(
                    "Position %r given without a chromosome" % positions
                )
            continue
        # Rows with several positions use " x " or ";" as the separator, and
        # CHR_ID and CHR_POS are split the same way.
        separator = " x " if " x " in chromosome_numbers else ";"
        chromosome_numbers = chromosome_numbers.split(separator)
        positions = positions.split(separator)
        snps = words[21]
        # A row with several positions lists several ids in SNPS, so comparing
        # a record id against the whole field can never match. Compare against
        # the individual ids instead.
        # NOTE(review): assumes the ids are separated by ";", "," or " x " -
        # confirm against the catalog release in use.
        snp_ids = {
            snp.strip()
            for snp in snps.replace(" x ", ";").replace(",", ";").split(";")
        }
        # zip() stops at the shorter list if the two columns disagree in length.
        for position, chromosome_number in zip(positions, chromosome_numbers):
            # Catalog positions are 1-based; fetch() takes a 0-based,
            # half-open interval.
            start = int(position) - 1
            end = start + 1
            key = "chr%s" % chromosome_number
            name = names[key]
            for record in bcf.fetch(name, start, end):
                if record.id in snp_ids:
                    output.write(record)
                    written += 1
                    break
            else:
                skipped += 1
finally:
    # Close all files even if a row fails, so the records written so far are
    # flushed to the output file.
    output.close()
    lines.close()
    bcf.close()

print("%d records written, %d lines skipped" % (written, skipped))
